{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import datetime\n",
    "import pandas as pd\n",
    "from sklearn import preprocessing\n",
    "\n",
    "ts_csv_path = 'data/test_FE.csv'\n",
    "tr_csv_path = 'data/train_FE.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4577464, 20)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test  = pd.read_csv(ts_csv_path, dtype={'id': 'U'}, index_col='id')\n",
    "test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(40428967, 20)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train = pd.read_csv(tr_csv_path, dtype={'id': 'U'}, index_col='id')\n",
    "train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "tr_ts = pd.concat([test, train], copy=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# app_id和site_id对应的用户规模\n",
    "import json\n",
    "app_id_users = json.load(open(\"data/app_id_users_dict.json\"))\n",
    "site_id_users = json.load(open(\"data/site_id_users_dict.json\"))\n",
    "tr_ts['app_id_users'] = tr_ts.C_app_id.apply(lambda x: app_id_users[str(x)])\n",
    "tr_ts['site_id_users'] = tr_ts.C_site_id.apply(lambda x: site_id_users[str(x)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 所有类别变量命名C_N，便于生成ffm格式数据。\n",
    "C_fields = [ 'hour', 'banner_pos', 'site_category', 'app_domain', 'app_category',\n",
    "            'device_conn_type', 'C14', 'C18', 'C19', 'C20','C21', 'is_device', 'C_app_id', 'C_site_id', \n",
    "            'C_site_domain', 'C_device_model', 'C_pix', 'C_device_type_1']\n",
    "new_name_dict = {}\n",
    "for f, column in enumerate(C_fields):\n",
    "    new_name_dict[column] = \"C{0}\".format(f)\n",
    "tr_ts.rename(columns=new_name_dict, inplace=True)\n",
    "tr_ts.drop('device_ip', axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "tr_ts.iloc[:test.shape[0],].to_csv('data/ts_FE.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 训练数据和验证数据50%随机采样\n",
    "import numpy as np\n",
    "sample_pct = 0.5\n",
    "np.random.seed(999)\n",
    "r1 = np.random.uniform(0, 1, train.shape[0])  #产生0～40M的随机数\n",
    "train_FE_1 = tr_ts.iloc[test.shape[0]:, :]\n",
    "va = train_FE_1.iloc[r1 <= sample_pct, :]\n",
    "tr = train_FE_1.iloc[r1 > sample_pct, :]\n",
    "tr.to_csv('data/tr_50_FE.csv')\n",
    "va.to_csv('data/va_50_FE.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'C0': 24,\n",
       " 'C1': 7,\n",
       " 'C2': 26,\n",
       " 'C3': 580,\n",
       " 'C4': 36,\n",
       " 'C5': 4,\n",
       " 'C6': 2885,\n",
       " 'C7': 4,\n",
       " 'C8': 69,\n",
       " 'C9': 172,\n",
       " 'C10': 62,\n",
       " 'C11': 2,\n",
       " 'C12': 3269,\n",
       " 'C13': 2763,\n",
       " 'C14': 2861,\n",
       " 'C15': 3355,\n",
       " 'C16': 10,\n",
       " 'C17': 9}"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 生成每个类别对应的取值数，生成ffm格式数据时需要用到\n",
    "C_count_dict = {}\n",
    "for f, column in enumerate(tr_ts.columns):\n",
    "    if column.startswith('C'):\n",
    "        C_count_dict[column] = tr_ts[column].value_counts().shape[0]\n",
    "json.dump(C_count_dict, open(\"data/C_count_dict.json\", \"w\"))\n",
    "C_count_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
